#ifndef CPU_LE
#error Only little endian CPUs supported
#endif

// 16/32-bit byte-order swapping.
//
// FIX: the guard previously read `defined(CPU_X86) && defined(COMPILER_MSC)`,
// which made the whole `#ifdef COMPILER_GCC` inline-asm section below dead
// code (GCC on x86 fell through to the generic branch). The interior
// structure shows both compilers were meant to take this hand-tuned path.
#if defined(CPU_X86) && (defined(COMPILER_MSC) || defined(COMPILER_GCC))

#ifdef COMPILER_GCC
#ifdef CPU_64
// "Q" constraint: a register whose high byte is addressable (ah/bh/ch/dh),
// required for the xchgb low/high byte swap in 64-bit mode.
inline word   SwapEndian16(word v)    { __asm__("xchgb %b0,%h0" : "=Q" (v) :  "0" (v)); return v; }
inline int16  SwapEndian16(int16 v)   { __asm__("xchgb %b0,%h0" : "=Q" (v) :  "0" (v)); return v; }
#else
inline word   SwapEndian16(word v)    { __asm__("xchgb %b0,%h0" : "=q" (v) :  "0" (v)); return v; }
inline int16  SwapEndian16(int16 v)   { __asm__("xchgb %b0,%h0" : "=q" (v) :  "0" (v)); return v; }
#endif
inline dword  SwapEndian32(dword v)   { __asm__("bswap %0" : "=r" (v) : "0" (v)); return v; }
inline int    SwapEndian32(int v)     { __asm__("bswap %0" : "=r" (v) : "0" (v)); return v; }
#endif

#ifdef COMPILER_MSC
#pragma intrinsic (_byteswap_ushort, _byteswap_ulong, _byteswap_uint64, strlen)

inline word   SwapEndian16(word v)    { return _byteswap_ushort(v); }
inline int16  SwapEndian16(int16 v)   { return _byteswap_ushort(v); }
inline dword  SwapEndian32(dword v)   { return _byteswap_ulong(v); }
inline int    SwapEndian32(int v)     { return _byteswap_ulong(v); }
#endif

// In-place variants built on the value-returning swaps above.
inline void   EndianSwap(word& v)     { v = SwapEndian16(v); }
inline void   EndianSwap(int16& v)    { v = SwapEndian16(v); }
inline void   EndianSwap(dword& v)    { v = SwapEndian32(v); }
inline void   EndianSwap(int& v)      { v = SwapEndian32(v); }

#else

#ifdef COMPILER_GCC

inline dword  SwapEndian32(dword v)   { return __builtin_bswap32(v); }
inline int    SwapEndian32(int v)     { return __builtin_bswap32(v); }

// 16-bit swap implemented via the 32-bit builtin: bswap32 of the zero/sign
// extended value leaves the swapped 16 bits in the top half.
inline word   SwapEndian16(word v)    { return SwapEndian32(v) >> 16; } // GCC bug workaround
inline int16  SwapEndian16(int16 v)   { return SwapEndian32(v) >> 16; }

inline void   EndianSwap(word& v)     { v = SwapEndian16(v); }
inline void   EndianSwap(int16& v)    { v = SwapEndian16(v); }
inline void   EndianSwap(dword& v)    { v = SwapEndian32(v); }
inline void   EndianSwap(int& v)      { v = SwapEndian32(v); }

#else

// Portable fallback: swap bytes in place through a byte pointer.
inline void   EndianSwap(word& v)     { byte *x = (byte *)(&v); Swap(x[0], x[1]); }
inline void   EndianSwap(int16& v)    { EndianSwap(*(word *)&v); }
inline void   EndianSwap(dword& v)    { byte *x = (byte *)&v; Swap(x[0], x[3]); Swap(x[1], x[2]); }
inline void   EndianSwap(int& v)      { EndianSwap(*(dword *)&v); }
inline word   SwapEndian16(word v)    { EndianSwap(v); return v; }
inline int16  SwapEndian16(int16 v)   { EndianSwap(v); return v; }
inline dword  SwapEndian32(dword v)   { EndianSwap(v); return v; }
inline int    SwapEndian32(int v)     { EndianSwap(v); return v; }

#endif

#endif

// 64-bit byte-order swapping.
#if defined(CPU_AMD64) && (defined(COMPILER_GCC) || defined(COMPILER_MSC))
#ifdef COMPILER_GCC
// Single bswap instruction on a 64-bit register.
inline uint64  SwapEndian64(uint64 v) { __asm__("bswap %0" : "=r" (v) : "0" (v)); return v; }
inline int64   SwapEndian64(int64 v)  { __asm__("bswap %0" : "=r" (v) : "0" (v)); return v; }
#endif
#ifdef COMPILER_MSC
inline uint64  SwapEndian64(uint64 v) { return _byteswap_uint64(v); }
inline int64   SwapEndian64(int64 v)  { return _byteswap_uint64(v); }
#endif

inline void   EndianSwap(int64& v)    { v = SwapEndian64(v); }
inline void   EndianSwap(uint64& v)   { v = SwapEndian64(v); }

#else

#ifdef COMPILER_GCC

inline uint64  SwapEndian64(uint64 v) { return __builtin_bswap64(v); }
inline int64   SwapEndian64(int64 v)  { return __builtin_bswap64(v); }

inline void    EndianSwap(int64& v)   { v = SwapEndian64(v); }
inline void    EndianSwap(uint64& v)  { v = SwapEndian64(v); }

#else
// Portable fallback: reverse the 8 bytes pairwise in place.
inline void   EndianSwap(int64& v)    { byte *x = (byte *)&v; Swap(x[0], x[7]); Swap(x[1], x[6]); Swap(x[2], x[5]); Swap(x[3], x[4]); }
inline void   EndianSwap(uint64& v)   { EndianSwap(*(int64 *)&v); }
inline int64  SwapEndian64(int64 v)   { EndianSwap(v); return v; }
inline uint64 SwapEndian64(uint64 v)  { EndianSwap(v); return v; }
#endif
#endif

// Convenience overloads: byte-swap the low 16 bits of a wider argument.
inline word   SwapEndian16(int w)     { return SwapEndian16((word)w); }
inline word   SwapEndian16(dword w)   { return SwapEndian16((word)w); }

// Bulk in-place swaps of 'count' consecutive elements (defined out of line).
void EndianSwap(word *v, size_t count);
void EndianSwap(int16 *v, size_t count);
void EndianSwap(dword *v, size_t count);
void EndianSwap(int *v, size_t count);
void EndianSwap(int64 *v, size_t count);
void EndianSwap(uint64 *v, size_t count);

// unaligned access - memcpy converts to simple load/store with normal compilers

inline int    Peek16(const void *ptr)        { word x; memcpy(&x, ptr, 2); return x; }
inline int    Peek32(const void *ptr)        { dword x; memcpy(&x, ptr, 4); return x; }
inline int64  Peek64(const void *ptr)        { uint64 x; memcpy(&x, ptr, 8); return x; }

inline void   Poke16(void *ptr, word val)    { memcpy(ptr, &val, 2); }
inline void   Poke32(void *ptr, dword val)   { memcpy(ptr, &val, 4); }
inline void   Poke64(void *ptr, int64 val)   { memcpy(ptr, &val, 8); }

inline int    Peek16le(const void *ptr)      { return Peek16(ptr); }
inline int    Peek32le(const void *ptr)      { return Peek32(ptr); }
inline int64  Peek64le(const void *ptr)      { return Peek64(ptr); }

inline void   Poke16le(void *ptr, word val)   { Poke16(ptr, val); }
inline void   Poke32le(void *ptr, dword val)   { Poke32(ptr, val); }
inline void   Poke64le(void *ptr, int64 val) { Poke64(ptr, val); }

inline int    Peek16be(const void *ptr)      { return SwapEndian16(Peek16(ptr)); }
inline int    Peek32be(const void *ptr)      { return SwapEndian32(Peek32(ptr)); }
inline int64  Peek64be(const void *ptr)      { return SwapEndian64(Peek64(ptr)); }

inline void   Poke16be(void *ptr, word val)  { Poke16(ptr, SwapEndian16(val)); }
inline void   Poke32be(void *ptr, dword val) { Poke32(ptr, SwapEndian32(val)); }
inline void   Poke64be(void *ptr, int64 val) { Poke64(ptr, SwapEndian64(val)); }

// Compose integers from individual bytes, b0 being the lowest byte.
// NOTE(review): MAKEWORD/MAKELONG/MAKEQWORD are assumed to be defined
// elsewhere in the project headers - not visible in this chunk.
#define MAKE2B(b0, b1)                            MAKEWORD(b0, b1)
#define MAKE4B(b0, b1, b2, b3)                    MAKELONG(MAKEWORD(b0, b1), MAKEWORD(b2, b3))
#define MAKE8B(b0, b1, b2, b3, b4, b5, b6, b7)    MAKEQWORD(MAKE4B(b0, b1, b2, b3), MAKE4B(b4, b5, b6, b7))

// Hashing support: hash_t is 64-bit on CPU_64 builds, 32-bit otherwise.
#ifdef CPU_64

#define HASH64

// Odd multiplicative constants for hashing code.
#define HASH_CONST1 I64(0xf7c21089bee7c0a5)
#define HASH_CONST2 I64(0xc85abc8da7534a4d)
#define HASH_CONST3 I64(0x8642b0fe3e86671b)

typedef qword hash_t;

// Folds a 64-bit hash to 32 bits: multiply by an odd constant, then
// byte-swap so the product's high bytes land in the low dword kept by
// the (dword) cast.
inline dword FoldHash(qword h)
{
	return (dword)SwapEndian64(HASH_CONST3 * h);
}

#else

#define HASH_CONST1 0xbee7c0a5
#define HASH_CONST2 0xa7534a4d
#define HASH_CONST3 0x8e86671b

typedef dword hash_t;

// 32-bit fold: multiply by an odd constant and byte-swap to move the
// well-mixed high bytes into the low ones.
inline dword FoldHash(dword h)
{
	return SwapEndian32(HASH_CONST3 * h);
}

#endif

// Always-32-bit fold. The literal equals the 32-bit HASH_CONST3 defined in
// the #else branch above; it cannot reference HASH_CONST3 directly because
// that macro expands to the 64-bit constant on CPU_64 builds.
inline dword FoldHash32(dword h)
{
	return SwapEndian32(0x8e86671b * h);
}

// Branchless clamp of x to 0..255: for negative x, x >> 24 is -1 so
// ~(x >> 24) zeroes the result; for x > 255, -(x >> 8) >> 24 is all ones
// and saturates the result to 0xff. Valid only for |x| < 2^24 (the masks
// assume x >> 24 is 0 or -1), i.e. intermediates of 8-bit arithmetic.
// Relies on arithmetic (sign-propagating) right shift of negative ints.
inline byte Saturate255(int x)             { return byte(~(x >> 24) & (x | (-(x >> 8) >> 24)) & 0xff); }

force_inline
int SignificantBits(dword x)
{ // basically log2(x) + 1 except that for 0 this is 0, number of significant bits of x
#ifdef COMPILER_MSC
	// _BitScanReverse stores the index of the highest set bit and
	// returns 0 (false) when x == 0.
	DWORD index;
	return _BitScanReverse(&index, x) ? index + 1 : 0;
#else
	// __builtin_clz is undefined for 0, hence the explicit guard.
	return x ? 32 - __builtin_clz(x) : 0;
#endif
}

force_inline
int SignificantBits64(uint64 x)
{ // basically log2(x) + 1 except that for 0 this is 0, number of significant bits of x
#ifdef COMPILER_MSC
#ifdef CPU_64
	DWORD index;
	return _BitScanReverse64(&index, x) ? index + 1 : 0;
#else
	// 32-bit MSC has no 64-bit BSR intrinsic: test the high dword first,
	// then fall back to the 32-bit routine on the relevant half.
	if(x & 0xffffffff00000000)
		return SignificantBits(HIDWORD(x)) + 32;
	else
		return SignificantBits((DWORD)x);
#endif
#else
	// __builtin_clzll is undefined for 0, hence the guard.
	return x ? 64 - __builtin_clzll(x) : 0;
#endif
}

// Returns true when x lies inside the value range of int64, [-2^63, 2^63).
// 2^63 is exactly representable as a double, so both comparisons are exact;
// NaN compares false on both sides and is rejected.
inline bool FitsInInt64(double x)
{
	const double lim = 9223372036854775808.0; // 2^63
	return -lim <= x && x < lim;
}

force_inline
int CountBits(dword mask)
{ // number of set bits in mask (population count)
// FIX: the original wrapped the intrinsics in `#ifndef flagLEGACY_CPU` with
// the fallback in its #else, so a compiler that is neither GCC nor MSC got a
// function with no return statement (UB) and could never reach the fallback.
// Restructured to the same pattern CountBits64 below already uses.
#if COMPILER_GCC && !defined(flagLEGACY_CPU)
    return __builtin_popcount(mask);
#elif COMPILER_MSC && !defined(flagLEGACY_CPU)
    return __popcnt(mask);
#else
    // Fallback (unlikely): classic parallel ("SWAR") bit count.
    mask = mask - ((mask >> 1) & 0x55555555);
    mask = (mask & 0x33333333) + ((mask >> 2) & 0x33333333);
    mask = (mask + (mask >> 4)) & 0x0F0F0F0F;
    mask = mask + (mask >> 8);
    mask = mask + (mask >> 16);
    return mask & 0x3F;
#endif
}

force_inline
int CountBits64(uint64 mask)
{ // number of set bits in a 64-bit mask (population count)
#if COMPILER_GCC && !defined(flagLEGACY_CPU)
    return __builtin_popcountll(mask);
#elif COMPILER_MSC && !defined(flagLEGACY_CPU)
    #if CPU_64
        return (int)__popcnt64(mask);
    #else
        // 32-bit MSC has no 64-bit popcount intrinsic - sum both halves.
        return CountBits(static_cast<dword>(mask)) +  CountBits(static_cast<dword>(mask >> 32));
    #endif
#else
    // Fallback (unlikely)
    // Classic parallel ("SWAR") bit count; 0x7F final mask suffices
    // because the count is at most 64.
    mask = mask - ((mask >> 1) & 0x5555555555555555ULL);
    mask = (mask & 0x3333333333333333ULL) + ((mask >> 2) & 0x3333333333333333ULL);
    mask = (mask + (mask >> 4)) & 0x0F0F0F0F0F0F0F0FULL;
    mask = mask + (mask >> 8);
    mask = mask + (mask >> 16);
    mask = mask + (mask >> 32);
    return mask & 0x7F;
#endif
}

force_inline
int CountTrailingZeroBits(dword x)
{ // index of the lowest set bit; callers must pass x != 0 (result is
  // undefined for 0 on the intrinsic paths, 31 on the fallback path)
#if COMPILER_GCC && !defined(flagLEGACY_CPU)
	return __builtin_ctz(x);
#elif COMPILER_MSC && !defined(flagLEGACY_CPU)
	// _BitScanForward's found/not-found return is deliberately ignored;
	// 'index' is indeterminate when x == 0.
	unsigned long index;
	_BitScanForward(&index, x);
	return index;
#else
	// unlikely fallback
	// Binary search for the low set bit, halving the window each step.
	int ret = 0;
	if((x & 0xffff) == 0) {
		x >>= 16;
		ret += 16;
	}
	if((x & 0xff) == 0) {
		x >>= 8;
		ret += 8;
	}
	if((x & 0xf) == 0) {
		x >>= 4;
		ret += 4;
	}
	if((x & 0x3) == 0) {
		x >>= 2;
		ret += 2;
	}
	if((x & 0x1) == 0)
		ret += 1;
	return ret;
#endif
}

force_inline
int CountTrailingZeroBits64(uint64 x)
{ // 64-bit variant; callers must pass x != 0 (undefined for 0 on
  // the intrinsic paths)
#if COMPILER_GCC && !defined(flagLEGACY_CPU)
	return __builtin_ctzll(x);
#elif COMPILER_MSC && !defined(flagLEGACY_CPU) && CPU_64
	unsigned long index;
	_BitScanForward64(&index, x);
	return index;
#else
	// unlikely fallback
	// Apply the 32-bit routine to whichever half holds the lowest set bit.
	return (x & 0xffffffff) ? CountTrailingZeroBits((dword)x) : CountTrailingZeroBits((dword)(x >> 32)) + 32;
#endif
}

#if defined(__SIZEOF_INT128__) && (__GNUC__ > 5 || __clang_major__ >= 5)

#ifdef CPU_X86

force_inline
byte addc64(uint64& result, const uint64& value, byte carry) {
	// result += value + carry; returns the carry-out (0 or 1) via the
	// x86 ADC intrinsic.
	// NOTE(review): _addcarry_u64 requires <immintrin.h>/<x86intrin.h> -
	// presumably included elsewhere in the project headers; confirm.
	return _addcarry_u64(carry, result, value, &result);
}

#else

force_inline
byte addc64(uint64& r, uint64 a, byte carry)
{ // r += a + carry; returns the carry-out (0 or 1)
	uint64 sum = r + a + carry;
	// With carry in, wrap-around happened iff sum <= a; without it,
	// iff sum < a (standard unsigned overflow test).
	byte carry_out = carry ? byte(sum <= a) : byte(sum < a);
	r = sum;
	return carry_out;
}

#endif

force_inline
uint64 mul64(uint64 a, uint64 b, uint64& hi)
{
	// Full 64x64 -> 128-bit multiply via the compiler's __int128:
	// the high 64 bits go to 'hi', the low 64 bits are returned.
	unsigned __int128 prod =  (unsigned __int128)a * b;
	hi = prod >> 64;
	return prod;
}

#elif defined(COMPILER_MSC) && defined(CPU_64)

force_inline
uint64 mul64(uint64 a, uint64 b, uint64& hi)
{
	// MSC x64: _umul128 returns the low 64 bits and stores the high
	// 64 bits of the full product in 'hi'.
	return _umul128(a, b, &hi);
}

force_inline
byte addc64(uint64& result, const uint64& value, byte carry) {
	// result += value + carry; returns the carry-out (0 or 1) via the
	// x86 ADC intrinsic (available with MSC on CPU_64).
	return _addcarry_u64(carry, result, value, &result);
}

#else

force_inline
byte addc64(uint64& r, uint64 a, byte carry)
{ // portable add-with-carry: r += a + carry; returns carry-out (0 or 1)
	uint64 sum = r + a + carry;
	// With carry in, wrap-around happened iff sum <= a; without it,
	// iff sum < a (standard unsigned overflow test).
	byte carry_out = carry ? byte(sum <= a) : byte(sum < a);
	r = sum;
	return carry_out;
}

force_inline
uint64 mul64(uint64 a, uint64 b, uint64& hi)
{
	// Portable 64x64 -> 128-bit multiply via four 32x32 partial products.
	uint64 lo_lo = (a & 0xFFFFFFFF) * (b & 0xFFFFFFFF);
	uint64 hi_lo = (a >> 32)        * (b & 0xFFFFFFFF);
	uint64 lo_hi = (a & 0xFFFFFFFF) * (b >> 32);
	uint64 hi_hi = (a >> 32)        * (b >> 32);
	
	// 'cross' collects the middle 64 bits; it cannot overflow: two values
	// below 2^32 plus a product of at most (2^32-1)^2 stay below 2^64.
	uint64 cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi;
	hi = (hi_lo >> 32) + (cross >> 32)        + hi_hi;

	// Low half: low 32 bits of cross joined with low 32 bits of lo_lo.
	return (cross << 32) | (lo_lo & 0xFFFFFFFF);
}

#endif
